import re
from bs4 import BeautifulSoup
import requests
import csv
# Cookie name/value pairs sent with every page request (passed to
# requests.get via the `cookies=` argument further down).
# Presumably exported from a logged-in browser session via "copy as cURL" --
# TODO confirm.
# NOTE(review): entries such as lianjia_token / lianjia_token_secure /
# security_ticket look like session credentials; consider loading them from
# the environment or a local, untracked file instead of committing them, and
# expect them to need refreshing once the session expires.
cookies = {
    'lianjia_uuid': 'b7557f89-a2b2-4cbc-ae42-57d362fbbc23',
    'crosSdkDT2019DeviceId': '-9qj3nn--1u6alf-splm1tunj2phukm-qaik6os49',
    'ftkrc_': 'c8ed2aa6-2296-4eab-a477-e0874b231199',
    'lfrc_': '650d1215-de42-40fa-a2ee-481dacc4c0cd',
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2218b6198e06c76f-088eea78a0913f-745d5771-1327104-18b6198e06d1360%22%2C%22%24device_id%22%3A%2218b6198e06c76f-088eea78a0913f-745d5771-1327104-18b6198e06d1360%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fcn.bing.com%2F%22%2C%22%24latest_referrer_host%22%3A%22cn.bing.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D',
    'select_city': '440100',
    'Hm_lvt_9152f8221cb6243a53c83b956842be8a': '1698149494,1698202724,1698293613,1698460209',
    'login_ucid': '2000000378662838',
    'lianjia_token': '2.0012477792414ff5c803ea5ea30f897f2c',
    'lianjia_token_secure': '2.0012477792414ff5c803ea5ea30f897f2c',
    'security_ticket': 'IewGAY2fA7+7XFTnN66MDmkdTaYh0GJVCTXSFf93FpRcGmHzOYEXYyH5aQisS4W9AW1Rrv8mIPV5YUvUIuvtUbNJGLbzpxg91L2lUaiUebVx/7dxiqkBLrpLgQ/PoCQJC25uGm00GPvVk0lgk/gNvQZTq46Gupyy+xlHt8elzQg=',
    'lianjia_ssid': '2e6126a1-7d5a-4978-b945-c8bf3c2a3c1f',
    'Hm_lpvt_9152f8221cb6243a53c83b956842be8a': '1698469879',
    'srcid': 'eyJ0Ijoie1wiZGF0YVwiOlwiYWVlMzZjYzkwMmU4ZGQ1MjkzZGQ5YmI2YmI0Nzc0ZTcxZWZlMGJkYzRjMmVlNTE2ODEwMjU1MDY4NDQwZWNlNmZlMWNmMWI4MmY2NWUyNmFhZDhmMmRlMmM1OTUyYTMzMjIxNzRmMzY5MDcyYjBlNmE1M2Y5ODJkMmMyNTgzMDUyMTViMmVmMGFmNzBlMThmZGVmN2IyODBkZmIzNTZiYjg5MTQ4YzRiMWY2OTc1ODU2MmNmN2E4NDFjZDM4NmQ1ZWE5YTQyOWMxNWYwZDBlY2VhZTQ3MDhmOTAzMzNkNGRjNjMxYzhkNzFhOWIzODY5ZTMwZmZjOWM1ZmM4ZmQxMlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIyMDEwODFmMFwifSIsInIiOiJodHRwczovL2d6LmtlLmNvbS9jaGVuZ2ppYW8vIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=',
}

# HTTP request headers sent with every page request (passed to requests.get
# via the `headers=` argument further down). The values describe a desktop
# Microsoft Edge 118 browser on Windows, as stated by the User-Agent and
# sec-ch-ua entries.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # No 'Cookie' header is set here on purpose: the cookies are supplied
    # separately through the `cookies` dict, which requests serialises itself.
    'Referer': 'https://gz.ke.com/ershoufang/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.69',
    'sec-ch-ua': '"Chromium";v="118", "Microsoft Edge";v="118", "Not=A?Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# URL template for the listing pages; `{}` is replaced by the page number.
base_url = 'https://gz.ke.com/chengjiao/pg{}/'

# CSS class of each scraped field, in the same order as the CSV columns.
# NOTE: 'dealCycleeInfo' (double "e") is the selector exactly as the script
# has always used it; it is a runtime string, so it is left untouched.
_FIELD_CLASSES = (
    'title',
    'houseInfo',
    'dealDate',
    'totalPrice',
    'positionInfo',
    'unitPrice',
    'dealHouseInfo',
    'dealCycleeInfo',
)

# Compiled once instead of re-specifying the pattern for every row.
_WHITESPACE_RE = re.compile(r'\s+')

# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would hang the whole run.
_REQUEST_TIMEOUT = 30


def _field_text(info, class_name):
    """Return the text of the first descendant of *info* with *class_name*.

    All whitespace is removed from the text. An empty string is returned when
    no such descendant exists, so a listing that lacks one field yields an
    empty CSV cell instead of raising AttributeError.
    """
    node = info.find(class_=class_name)
    if node is None:
        return ''
    return _WHITESPACE_RE.sub('', node.text)


# Create the CSV file and write the header row.
with open(r'C:\Users\LXH\Desktop\已成交的广州二手房数据\广州.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['标题', '房屋', '成交日期', '价格', '房屋信息', '单价', '房屋年限', '挂牌价格/成交周期'])

    # Fetch pages 1..100 one after another.
    for i in range(1, 101):
        url = base_url.format(i)

        try:
            response = requests.get(
                url,
                cookies=cookies,
                headers=headers,
                timeout=_REQUEST_TIMEOUT,
            )
            # Do not parse HTTP error pages as if they were listings.
            response.raise_for_status()
        except requests.RequestException as exc:
            # One bad page should not discard the rest of the run: report it
            # and move on to the next page.
            print('Skipping page {} ({}): {}'.format(i, url, exc))
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for info in soup.find_all(class_='info'):
            # class_='info' matches any element carrying that class; entries
            # without a title are not listings and are skipped rather than
            # crashing the scrape or emitting a blank row.
            if info.find(class_='title') is None:
                continue
            writer.writerow([_field_text(info, name) for name in _FIELD_CLASSES])